{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 处理结构化数据代码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# coding=utf8\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "import random\n",
    "import numpy as np\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = pd.read_csv('task3/task3/train.csv')\n",
    "val_ids = pickle.load(open('data/val_ids_%d.pkl' % 18, 'rb'))\n",
    "train_ids = list(set(train_data['id']) - set(val_ids))\n",
    "train_val_data_unique = train_data.drop_duplicates()\n",
    "train_val_data_unique.index = train_val_data_unique['id']\n",
    "test_data = pd.read_csv('task3_test_stage1_new.csv')\n",
    "test_data.index = test_data['id']\n",
    "test_ids = list(test_data['id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(64, 3902)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "same_id = set(test_ids) & (set(val_ids) | set(train_ids))\n",
    "len(same_id), len(test_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "target = train_val_data_unique['label']\n",
    "train_val_data_unique = train_val_data_unique.drop(['label'], axis=1)\n",
    "all_data = train_val_data_unique.append(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data = all_data.fillna(-1)\n",
    "province_list = ['北京', '天津', '河北', '山西', '内蒙古', '辽宁', '吉林','黑龙江', '上海', '江苏', '浙江', '安徽', '福建', '江西', '山东',\\\n",
    "                '河南', '湖北', '湖南', '广东', '广西', '海南', '重庆', '四川', '贵州', '云南', '西藏', '陕西', '甘肃', '青海', '宁夏',\\\n",
    "                '新疆', '台湾', '香港', '澳门']\n",
    "def get_province(x):\n",
    "    \"\"\"\n",
    "    \"\"\"\n",
    "    if x == -1:\n",
    "        return -1\n",
    "    if '其他' in x:\n",
    "        return x\n",
    "    if '海外' in x:\n",
    "        return x\n",
    "    for province in province_list:\n",
    "        if province in x:\n",
    "            return province\n",
    "province = all_data.userLocation.apply(get_province)\n",
    "\n",
    "feature = ['userFollowCount', 'userFansCount', 'userWeiboCount', 'feature_len_piclist', 'feature_userGender', 'feature_userLocation', \\\n",
    "           'feature_userProvince', 'feature_len_userDescription', 'feature_category']\n",
    "all_data['feature_len_piclist'] = all_data.piclist.apply(lambda x : (-1 if x == -1 else len(x.split('\\t'))))\n",
    "all_data['feature_userGender'] = all_data.userGender.apply(lambda x : {'男': 0, '女':1, -1:-1}[x])\n",
    "all_data['feature_userLocation'] = pd.factorize(all_data.userLocation)[0]\n",
    "all_data['feature_userProvince'] = pd.factorize(province)[0]\n",
    "all_data['feature_len_userDescription'] = all_data.userDescription.apply(lambda x : (-1 if x == -1 else len(x)))\n",
    "all_data['feature_category'] = pd.factorize(all_data.category)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import lightgbm as lgb\n",
    "param = {\n",
    "    'bagging_freq': 5,\n",
    "    'bagging_fraction': 0.335,\n",
    "    'boost_from_average':'false',\n",
    "    'boost': 'gbdt',\n",
    "    'feature_fraction': 0.041,\n",
    "    'learning_rate': 0.0083,\n",
    "    'max_depth': -1,\n",
    "    'metric':'auc',\n",
    "    'min_data_in_leaf': 80,\n",
    "    'min_sum_hessian_in_leaf': 10.0,\n",
    "    'num_leaves': 13,\n",
    "    'num_threads': 8,\n",
    "    'tree_learner': 'serial',\n",
    "    'objective': 'binary', \n",
    "    'verbosity': -1\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = all_data.iloc[:-len(test_data)].loc[train_ids]\n",
    "val = all_data.iloc[:-len(test_data)].loc[val_ids]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(27250, 27250)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train), len(target.loc[train_ids])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training until validation scores don't improve for 4000 rounds.\n",
      "[5000]\ttraining's auc: 0.977394\tvalid_1's auc: 0.974985\n",
      "[10000]\ttraining's auc: 0.980208\tvalid_1's auc: 0.977126\n",
      "[15000]\ttraining's auc: 0.981504\tvalid_1's auc: 0.978044\n",
      "[20000]\ttraining's auc: 0.982287\tvalid_1's auc: 0.978424\n",
      "[25000]\ttraining's auc: 0.982836\tvalid_1's auc: 0.978616\n",
      "Early stopping, best iteration is:\n",
      "[23394]\ttraining's auc: 0.982685\tvalid_1's auc: 0.978707\n"
     ]
    }
   ],
   "source": [
    "trn_data = lgb.Dataset(train[feature], label=target.loc[train_ids])\n",
    "val_data = lgb.Dataset(val[feature], label=target.loc[val_ids])\n",
    "clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 4000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([0.91363502, 0.911739  ]), array([0.91067217, 0.91466978]), array([0.91215119, 0.91320204]), array([3392, 3422], dtype=int64))\n",
      "0.9126766154238151\n"
     ]
    }
   ],
   "source": [
    "val_score = clf.predict(val[feature], num_iteration=clf.best_iteration)\n",
    "pre = val_score > 0.5\n",
    "from sklearn.metrics import precision_recall_fscore_support\n",
    "from sklearn.metrics import f1_score\n",
    "\n",
    "print(precision_recall_fscore_support(target.loc[val_ids], pre))\n",
    "print(f1_score(target.loc[val_ids], pre, average='macro'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([0.94514768, 0.64583333]), array([0.86821705, 0.82666667]), array([0.90505051, 0.7251462 ]), array([258,  75], dtype=int64))\n",
      "0.8588588588588588\n"
     ]
    }
   ],
   "source": [
    "uncofidence_ids = pickle.load(open('data/unconfidence_ids.pkl', 'rb'))\n",
    "uncofidence_val = val.loc[uncofidence_ids]\n",
    "unconfidence_val_score = clf.predict(uncofidence_val[feature], num_iteration=clf.best_iteration)\n",
    "unconfidence_val_pre = unconfidence_val_score > 0.5\n",
    "print(precision_recall_fscore_support(target.loc[uncofidence_ids], unconfidence_val_pre))\n",
    "from sklearn.metrics import accuracy_score\n",
    "print(accuracy_score(unconfidence_val_pre, target.loc[uncofidence_ids]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data['label'] = clf.predict(all_data.iloc[-len(test_data):][feature]) > 0.5\n",
    "test_data['label'] = test_data['label'].astype('float')\n",
    "test_data[['id', 'label']].to_csv('submit_user_info.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    2088\n",
       "1.0    1814\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_data['label'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<lightgbm.basic.Booster at 0x2610d5fba58>"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.save_model('lgb_model.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>piclist</th>\n",
       "      <th>userGender</th>\n",
       "      <th>userFollowCount</th>\n",
       "      <th>userFansCount</th>\n",
       "      <th>userWeiboCount</th>\n",
       "      <th>userLocation</th>\n",
       "      <th>userDescription</th>\n",
       "      <th>category</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>84cdcfed1aeb7047ad168be7bb9b559e</td>\n",
       "      <td>回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：</td>\n",
       "      <td>NaN</td>\n",
       "      <td>男</td>\n",
       "      <td>1728.0</td>\n",
       "      <td>748.0</td>\n",
       "      <td>30884.0</td>\n",
       "      <td>北京朝阳区</td>\n",
       "      <td>我们虚度的今天恰是。。昨天去世人渴望的明天。</td>\n",
       "      <td>文体娱乐</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>93ebc056c547618b5b00ab35270c9fad</td>\n",
       "      <td>//分享网易新闻:《发生在昆明的火锅店老板“辱滇门”，云南人该愤怒还是羞愧》|发生在昆明.....</td>\n",
       "      <td>63ad082a189566eed7c4bb3e4bc55012.jpg</td>\n",
       "      <td>男</td>\n",
       "      <td>423.0</td>\n",
       "      <td>112.0</td>\n",
       "      <td>792.0</td>\n",
       "      <td>云南楚雄</td>\n",
       "      <td>用心生活</td>\n",
       "      <td>社会生活</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>eefcba3b5856fe8f55213e036ee463ca</td>\n",
       "      <td>西宁城管围殴民警扬言要把警察打死|西宁城管围...</td>\n",
       "      <td>4986dc2a5f09a87c7af5dfc57d7775cd.jpg</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>社会生活</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>8e09b1b13477f62139b5cd7a7a7dcb8f</td>\n",
       "      <td>【川航航班因驾驶舱风挡破裂安全备降成都】今天上午6:26从重庆江北国际机场出发前往拉萨的四川...</td>\n",
       "      <td>dcfccfc69e90a0007afd6aafa1385e56.jpg</td>\n",
       "      <td>女</td>\n",
       "      <td>1668.0</td>\n",
       "      <td>7470000.0</td>\n",
       "      <td>57256.0</td>\n",
       "      <td>北京东城区</td>\n",
       "      <td>中国青年报•中青在线微博</td>\n",
       "      <td>社会生活</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9ffea4d9573c6e723e8f178a789888dc</td>\n",
       "      <td>支持郑强！！！//【贵州大学校长回应空姐言论:常给她们写感谢信】</td>\n",
       "      <td>NaN</td>\n",
       "      <td>男</td>\n",
       "      <td>267.0</td>\n",
       "      <td>61.0</td>\n",
       "      <td>1098.0</td>\n",
       "      <td>江苏盐城</td>\n",
       "      <td>能烧得全烧了，只剩下石头。</td>\n",
       "      <td>社会生活</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 id  \\\n",
       "0  84cdcfed1aeb7047ad168be7bb9b559e   \n",
       "1  93ebc056c547618b5b00ab35270c9fad   \n",
       "2  eefcba3b5856fe8f55213e036ee463ca   \n",
       "3  8e09b1b13477f62139b5cd7a7a7dcb8f   \n",
       "4  9ffea4d9573c6e723e8f178a789888dc   \n",
       "\n",
       "                                                text  \\\n",
       "0             回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：   \n",
       "1  //分享网易新闻:《发生在昆明的火锅店老板“辱滇门”，云南人该愤怒还是羞愧》|发生在昆明.....   \n",
       "2                          西宁城管围殴民警扬言要把警察打死|西宁城管围...   \n",
       "3  【川航航班因驾驶舱风挡破裂安全备降成都】今天上午6:26从重庆江北国际机场出发前往拉萨的四川...   \n",
       "4                   支持郑强！！！//【贵州大学校长回应空姐言论:常给她们写感谢信】   \n",
       "\n",
       "                                piclist userGender  userFollowCount  \\\n",
       "0                                   NaN          男           1728.0   \n",
       "1  63ad082a189566eed7c4bb3e4bc55012.jpg          男            423.0   \n",
       "2  4986dc2a5f09a87c7af5dfc57d7775cd.jpg        NaN              NaN   \n",
       "3  dcfccfc69e90a0007afd6aafa1385e56.jpg          女           1668.0   \n",
       "4                                   NaN          男            267.0   \n",
       "\n",
       "   userFansCount  userWeiboCount userLocation         userDescription  \\\n",
       "0          748.0         30884.0        北京朝阳区  我们虚度的今天恰是。。昨天去世人渴望的明天。   \n",
       "1          112.0           792.0         云南楚雄                    用心生活   \n",
       "2            NaN             NaN          NaN                     NaN   \n",
       "3      7470000.0         57256.0        北京东城区            中国青年报•中青在线微博   \n",
       "4           61.0          1098.0         江苏盐城           能烧得全烧了，只剩下石头。   \n",
       "\n",
       "  category  label  \n",
       "0     文体娱乐      0  \n",
       "1     社会生活      0  \n",
       "2     社会生活      0  \n",
       "3     社会生活      0  \n",
       "4     社会生活      0  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "nan"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data.piclist.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(38471, 21832)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train_data), train_data.piclist.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "16639"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "38471 - 21832"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "uncofidence_ids = pickle.load(open('data/unconfidence_ids.pkl', 'rb'))\n",
    "uncofidence_val = train_val_data_unique.loc[uncofidence_ids]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "212"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "uncofidence_val.piclist.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "333"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(uncofidence_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
